In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
In [2]:
from IPython.display import Image
Image('images/decision-tree.png')
Out[2]:
Let's now look at an example of building a decision tree classifier on some generated data.
In [3]:
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=4,
                  random_state=0, cluster_std=1.0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow');
The following figure visualizes the first four levels of a decision tree classifier fit to this data:
In [8]:
from IPython.display import Image
Image('images/decision_tree.png')
Out[8]:
Why is the uppermost branch not split?
In Scikit-Learn, decision trees are constructed with the DecisionTreeClassifier estimator:
In [9]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier().fit(X, y)
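Before visualizing the result, we can sanity-check the fitted tree directly, for example by printing its depth and a text rendering of the learned splits. A minimal sketch (export_text requires scikit-learn 0.21 or later, and the feature names here are made up purely for readability):
In [ ]:
from sklearn.tree import export_text

print(tree.get_depth())  # unconstrained trees grow until every leaf is pure
print(export_text(tree, feature_names=['x0', 'x1']))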
In [10]:
def visualize_classifier(model, X, y, ax=None, cmap='rainbow'):
    ax = ax or plt.gca()

    # Plot the training points
    ax.scatter(X[:, 0], X[:, 1], c=y, s=30, cmap=cmap,
               clim=(y.min(), y.max()), zorder=3)
    ax.axis('tight')
    ax.axis('off')
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # fit the estimator
    model.fit(X, y)
    xx, yy = np.meshgrid(np.linspace(*xlim, num=200),
                         np.linspace(*ylim, num=200))
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)

    # Create a color plot with the results
    n_classes = len(np.unique(y))
    contours = ax.contourf(xx, yy, Z, alpha=0.3,
                           levels=np.arange(n_classes + 1) - 0.5,
                           cmap=cmap, clim=(y.min(), y.max()),
                           zorder=1)

    ax.set(xlim=xlim, ylim=ylim)
In [13]:
visualize_classifier(DecisionTreeClassifier(), X, y)
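Left unconstrained, the tree keeps splitting until every leaf is pure, which makes it prone to overfitting. One way to see this is to fit the same model to two random halves of the data and compare the boundaries it learns; a minimal sketch reusing visualize_classifier:
In [ ]:
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
rng = np.random.RandomState(0)
i = rng.permutation(len(X))
# Fit an unconstrained tree on each half of a random shuffle of the data
for axi, idx in zip(ax, [i[:150], i[150:]]):
    visualize_classifier(DecisionTreeClassifier(), X[idx], y[idx], ax=axi)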
In [22]:
from helper.interactive_tree import plot_tree_interactive
plot_tree_interactive(X, y);
What do you think is the best classifier?
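Rather than guessing from the plots, we can ask cross-validation. A minimal sketch (the candidate depths are an arbitrary grid, and cv=5 is just a common default):
In [ ]:
from sklearn.model_selection import cross_val_score

for depth in [1, 2, 4, 6, 8, None]:
    model = DecisionTreeClassifier(max_depth=depth, random_state=0)
    scores = cross_val_score(model, X, y, cv=5)
    print(f'max_depth={depth}: {scores.mean():.3f}')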
In [24]:
from IPython.display import Image
Image('images/decision-tree-overfitting.png')
Out[24]:
In [25]:
from helper.interactive_tree import randomized_tree_interactive
randomized_tree_interactive(X, y)
The idea of combining many overfitting estimators, each fit to a random subset of the data, and averaging their results is known as bagging. This can be done manually using Scikit-Learn's BaggingClassifier
meta-estimator, as shown here:
In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
tree = DecisionTreeClassifier()
bag = BaggingClassifier(tree, n_estimators=100, max_samples=0.8, random_state=1)
bag.fit(X, y)
visualize_classifier(bag, X, y)
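Each of the 100 trees in the bag was fit to its own random draw of 80% of the points. If you want to confirm this, the fitted meta-estimator exposes the sample indices each tree saw (a quick check, not part of the original workflow):
In [ ]:
print(len(bag.estimators_))  # one fitted tree per ensemble member
# Indices drawn for the first tree; duplicates are possible because
# bagging samples with replacement
print(len(np.unique(bag.estimators_samples_[0])))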
We've essentially built a random forest by hand. In practice, Scikit-Learn provides an optimized RandomForestClassifier
estimator that takes care of all the subsampling and randomization automatically:
In [31]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_classifier(model, X, y);
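Because the forest averages the votes of its 100 trees, it also provides probabilistic predictions. A quick sketch (the three query points are an arbitrary choice):
In [ ]:
model.fit(X, y)
# Each row is the fraction of trees voting for each of the four classes
print(model.predict_proba(X[:3]).round(2))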